import io
import re

import fitz
from pypdf import PdfReader

"""
https://github.com/py-pdf/pypdf 
https://github.com/pdfminer/pdfminer.six 
"""


def clean(text):
    """Remove every non-whitelisted character from ``text``.

    The characters kept are CJK ideographs in the range U+4E00-U+9FA5, the
    ASCII punctuation marks ``,`` ``.`` ``!``, the digits ``0-9`` and
    newlines. Everything else (Latin letters, spaces, full-width
    punctuation, ...) is dropped. Leading and trailing newlines are then
    trimmed from the result.
    """
    unwanted = re.compile("[^\u4e00-\u9fa5,.!0-9\n]")
    kept = unwanted.sub('', text)
    return kept.strip('\n')


def pypdf_pdf2text(path):
    """Extract the text of a PDF with pypdf and clean it page by page.

    Note from the original author: content that cannot be parsed comes back
    as garbled characters.

    Args:
        path: Passed unchanged to ``PdfReader``.

    Returns:
        The text of every page, each run through ``clean``, joined with
        newlines.
    """
    pdf = PdfReader(path)
    cleaned_pages = []
    for page in pdf.pages:
        cleaned_pages.append(clean(page.extract_text()))
    return "\n".join(cleaned_pages)


def fitz_pdf2text(filename: str = None, stream: bytes = None):
    """Extract the text of a PDF with PyMuPDF (``fitz``).

    Note from the original author: this is relatively slow, but it can
    extract more content, such as watermarks and tables of contents.

    Args:
        filename: Forwarded to ``fitz.open`` as ``filename``.
        stream: Forwarded to ``fitz.open`` as ``stream``.

    Returns:
        The ``get_text()`` output of every page, joined with newlines.
    """
    page_texts = []
    with fitz.open(filename=filename, stream=stream) as pdf:
        for page in pdf:
            page_texts.append(page.get_text())
    return "\n".join(page_texts)


def pdf2text(file):
    """Extract text from a PDF, trying PyMuPDF first and pypdf second.

    Args:
        file: Either the raw PDF content (``bytes`` or ``bytearray``), which
            is handed to ``fitz_pdf2text`` as ``stream``, or any other value,
            which is handed to it as ``filename``.

    Returns:
        The result of ``fitz_pdf2text``; if that raises, the exception is
        printed and the result of ``pypdf_pdf2text`` is returned instead.

    Raises:
        Whatever ``pypdf_pdf2text`` raises when the fallback also fails.
    """
    # isinstance (rather than an exact type check) so that bytearray and
    # bytes subclasses are treated as in-memory content, not as a filename.
    in_memory = isinstance(file, (bytes, bytearray))
    try:
        if in_memory:
            return fitz_pdf2text(stream=file)
        return fitz_pdf2text(filename=file)
    except Exception as exc:
        # Deliberate best-effort boundary: report the failure and fall back.
        print(exc)
        # PdfReader takes a path or a seekable file-like object, not raw
        # bytes, so in-memory content must be wrapped before the fallback.
        source = io.BytesIO(file) if in_memory else file
        return pypdf_pdf2text(source)


if __name__ == '__main__':
    # Manual smoke test: extract a sample PDF located one directory up and
    # print the resulting text.
    t_path = "../条款1.pdf"
    extracted = pdf2text(t_path)
    print(extracted)
